gusucode.com > 《MATLAB图像与视频处理实用案例详解》代码 > 《MATLAB图像与视频处理实用案例详解》代码/第 19 章 基于语音识别的信号灯图像模拟控制技术/voicebox/fxpefac.m
function [fx,tx,pv,fv]=fxpefac(s,fs,tinc,m,pp) %FXPEFAC PEFAC pitch tracker [FX,TT,PV,FV]=(S,FS,TINC,M,PP) % % Input: s(ns) Speech signal % fs Sample frequency (Hz) % tinc Time increment between frames (s) [0.01] % or [start increment end] % m mode % 'g' plot graph showing waveform and pitch % 'G' plot spectrogram with superimposed pitch % 'x' use external files for algorithm parameter % initialization: fxpefac_g and fxpefac_w % pp structure containing algorithm parameters % % Outputs: fx(nframe) Estimated pitch (Hz) % tx(nframe) Time at the centre of each frame (seconds). % pv(nframe) Probability of the frame of being voiced % fv structure containing feature vectors % fv.vuvfea(nframe,2) = voiced/unvoiced GMM features % References % [1] S.Gonzalez and M. Brookes, % A pitch estimation filter robust to high levels of noise (PEFAC), Proc EUSIPCO,Aug 2011. % Bugs/Suggestions % (1) do long files in chunks % (2) option of n-best DP % Copyright (C) Sira Gonzalez and Mike Brookes 2011 % Version: $Id: fxpefac.m,v 1.2 2011/07/27 07:22:07 dmb Exp $ % % VOICEBOX is a MATLAB toolbox for speech processing. % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % This program is free software; you can redistribute it and/or modify % it under the terms of the GNU General Public License as published by % the Free Software Foundation; either version 2 of the License, or % (at your option) any later version. % % This program is distributed in the hope that it will be useful, % but WITHOUT ANY WARRANTY; without even the implied warranty of % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the % GNU General Public License for more details. % % You can obtain a copy of the GNU General Public License from % http://www.gnu.org/copyleft/gpl.html or by writing to % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% persistent w_u m_u v_u w_v m_v v_v dpwtdef % initialize persistent variables if ~numel(w_u) % voiced/unvoiced decision based on 2-element feature vector % (a) mean power of the frame's log-freq spectrum (normalized so its short-term average is LTASS) % (b) sum of the power in the first three peaks %===== VUV if nargin>3 && any(m=='x') fxpefac_g; % read in GMM parameters fxpefac_w; % read in Weights parameters else w_u=[0.2123723 0.207788 0.2701817 0.1293616 0.04741722 0.1328791 ]'; m_u=[0.2220388 0.4067706 ; 0.04567656 0.4016914 ; 0.8415278 0.3192158 ; 0.2194808 0.1910079 ; 1.6347 0.5819833 ; 1.181519 0.6996485 ]; v_u=reshape([0.01413822 0.003357913 0.003357913 0.01786169 ; 0.0009377269 0.0006220489 0.0006220489 0.03422057 ; 0.1233703 0.004299293 0.004299293 0.007660504 ; 0.01779449 0.002078821 0.002078821 0.001605052 ; 1.110173 0.00718649 0.00718649 0.005734435 ; 0.5477135 -0.00182316 -0.00182316 0.05659796 ]',[2 2 6]); w_v=[0.07758689 0.2109879 0.1856225 0.06853158 0.2701563 0.1871148 ]'; m_v=[1.208656 0.3365564 ; 1.216643 0.5971916 ; 4.08585 1.240948 ; 8.322102 1.349939 ; 1.734108 1.168643 ; 0.5107205 0.940308 ]; v_v=reshape([0.06181574 0.002950501 0.002950501 0.004528442 ; 0.2946077 0.01433284 0.01433284 0.02684239 ; 2.508473 -0.03310555 -0.03310555 0.1098579 ; 14.17252 -0.09009174 -0.09009174 0.07989255 ; 0.5834894 -0.07854027 -0.07854027 0.1108958 ; 0.05978017 0.005528601 0.005528601 0.1309329 ]',[2 2 6]); end %===== PDP % dfm = -0.4238; % df mean % dfv = 3.8968; % df variance (although treated as std dev here) % delta = 0.15; % dflpso=[dfm 0.5/(log(10)*dfv^2) -log(2*delta/(dfv*sqrt(2*pi)))/log(10)]; % scale factor & offset for df pdf % dpwtdef=[1.0000, 0.8250, 1.3064, 1.9863]; % default DP weights dpwtdef=[1.0000, 0.8250, 0.01868, 0.006773, 98.9, -0.4238]; % default DP weights %===== END end % Algorithm parameter defaults p.fstep=5; % frequency resolution of initial spectrogram (Hz) p.fmax=4000; % maximum frequency of initial spectrogram (Hz) p.fres = 20; % bandwidth of initial spectrogram (Hz) p.fbanklo = 40; % low frequency limit of log filterbank (Hz) p.mpsmooth = 201; % width of smoothing filter for mean power % p.maxtranf = 1000; % maximum value of tranf cost term p.shortut = 7; % max utterance length to average power of entire utterance p.pefact = 1.5; % shape factor in PEFAC filter p.numopt = 3; % number of possible frequencies per frame p.flim = [60 400]; % range of feasible fundamental frequencies (Hz) p.w = dpwtdef; % DP weights % p.rampk = 1.1; % constant for relative-amplitude cost term % p.rampcz = 100; % relative amplitude cost for missing peak p.tmf = 2; % median frequency smoothing interval (s) p.tinc = 0.01; % default frame increment (s) % update parameters from pp argument if nargin>=5 && isstruct(pp) fnq=fieldnames(pp); for i=1:length(fnq) if isfield(p,fnq{i}) p.(fnq{i})=pp.(fnq{i}); end end end % Sort out input arguments if nargin>=3 && numel(tinc)>0 p.tinc = tinc; % 0.01 s between consecutive time frames end if nargin<4 m=''; end % Spectrogram of the mixture fmin = 0; fstep = p.fstep; fmax = p.fmax; fres = p.fres; % Frequency resolution (Hz) [tx,f,MIX]=spgrambw(s,fs,fres,[fmin fstep fmax],[],p.tinc); nframes=length(tx); txinc=tx(2)-tx(1); % actual frame increment % ==== we could combine spgrambw and filtbankm into a single call to spgrambw or use fft directly ==== % Log-frequency scale [trans,cf]=filtbankm(length(f),2*length(f)-1,2*f(end),p.fbanklo,f(end),'usl'); O = MIX*trans'; % Original spectrum in Log-frequency scale %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Amplitude Compression % Calculate alpha based on LTASS ratios ltass = stdspectrum(6,'p',cf); auxf = [cf(1),(cf(1:end-1)+cf(2:end))./2,cf(end)]; ltass = ltass.*diff(auxf); % weight by bin width % estimated ltass O = O.*repmat(diff(auxf),nframes,1); % weight spectrum by bin width if tx(end)<p.shortut % if it is a short utterance eltass = mean(O,1); % mean power per each frequency band eltass = smooth(eltass,p.mpsmooth); % smooth in log frequency eltass= eltass(:).'; % force a row vector % Same mean power per frame as ltass cte = mean(ltass)/mean(eltass); eltass = eltass.*cte; % normalize to have the same mean as LTASS O = O.*cte; % Linear AC alpha = (ltass)./(eltass); alpha = alpha(:).'; alpha = repmat(alpha,nframes,1); O = O.*alpha; % force O to have an average LTASS spectrum % ==== should perhaps exclude the silent portions *** else % long utterance tsmo = 2; % time smoothing over 1 sec stt = round(tsmo/txinc); filttime = [ones(stt,1); zeros(stt-1,1)]; filtfreq = ones(1,p.mpsmooth); eltass = imfilter(O,filttime); eltass = imfilter(eltass,filtfreq); % filter in time and log frequency % Same mean power per frame than ltass cte = repmat(mean(ltass),nframes,1)./mean(eltass,2); eltass = eltass.*repmat(cte,1,length(cf)); O = O.*repmat(cte,1,length(cf)); % Linear AC alpha = repmat(ltass,nframes,1)./(eltass); O = O.*alpha; end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Create the filter to detect the harmonics ini = find(cf>2*cf(1)); sca = cf/cf(ini(1)); % normalize bin frequencies to start at approximately 0.5 sca = sca(sca<10.5 & sca>0.5); % restrict to 0.5 - 10.5 times fundamental filh = -log10(p.pefact-cos(2*pi*sca)); filh = filh-mean(filh); % force filter to be zero mean posit = find(sca>=1); % ==== this should just equal ini(1) ==== if ~mod(length(posit),2) filh = [filh 0]; % force to be an odd length after central tap end negat = find(sca<1); numz = length(posit)-1-length(negat); filh = filh./max(filh); filh = [zeros(1,numz) filh]; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Filter the log-frequency scaled spectrogram B = imfilter(O,filh); % ==== no good reason to use imfilter here ==== % Feasible frequency range numopt = p.numopt; % Number of possible fundamental frequencies per frame flim = p.flim; pfreq = find(cf>flim(1) & cf<flim(2)); ff = zeros(nframes,numopt); amp = zeros(nframes,numopt); for i=1:nframes [pos,peak]=findpeaks(B(i,pfreq),[],5/(cf(pfreq(2))-cf(pfreq(1)))); % ==== calculate some out of loop ==== if numel(pos) [peak,ind]=sort(peak,'descend'); pos = pos(ind); % indices of peaks in the B array posff = cf(pfreq(pos)); % frequencies of peaks fin = min(numopt,length(posff)); ff(i,1:fin)=posff(1:fin); % save both frequency and amplitudes amp(i,1:fin)=peak(1:fin); % else % ff(i,:)=0; % ==== unnecessary since they start as zeros ==== % amp(i,:)=0; end end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Probabilitly of the frame of being voiced % voiced/unvoiced decision based on 2-element feature vector % (a) mean power of the frame's log-freq spectrum (normalized so its short-term average is LTASS) % (b) sum of the power in the first three peaks pow = mean(O,2)*1e-6; vuvfea = [pow sum(amp,2)./(pow*1e9)]; pru=gaussmixp(vuvfea,m_u,v_u,w_u); % Probability of being unvoiced prv=gaussmixp(vuvfea,m_v,v_v,w_v); % Probability of being voiced % pru = exp(pru); % prv=exp(prv); % Linear probability % pv = prv./(prv+pru); % ==== better to write pv=(1+exp(pru-prv)).^(-1) ==== pv=(1+exp(pru-prv)).^(-1); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Dynamic programming % w(1): relative amp, voiced local cost % w(2): median pitch deviation cost % w(3): df cost weight % w(4): max df cost % w(5): relative amp cost for missing peaks (very high) % w(6): df mean w = p.w; % Relative amplitude camp = -amp./repmat(max(amp,[],2),1,numopt); % relative amplitude used as cost camp(amp==0)=w(5); % If no frequency found % Time interval for the median frequency tmf = p.tmf; % in sec inmf = round(tmf/txinc); %-------------------------------------------------------------------------- % FORWARDS % Initialize values cost = zeros(nframes,numopt); prev = zeros(nframes,numopt); medfx = zeros(nframes,1); dffact=2/txinc; % First time frame % cost(1,:) = w(1)*ramp(1,:); cost(1,:) = w(1)*camp(1,:); % only one cost term for first frame fpos = ff(1:min(inmf,end),1); mf=median(fpos(pv(1:min(inmf,end))>0.6)); % calculate median frequency of first 2 seconds if isnan(mf) mf=median(fpos(pv(1:min(inmf,end))>0.5)); if isnan(mf) mf=median(fpos(pv(1:min(inmf,end))>0.4)); if isnan(mf) mf=median(fpos(pv(1:min(inmf,end))>0.3)); % ==== clumsy way of ensuring that we take the best frames ==== if isnan(mf) mf=0; end end end end medfx(1)=mf; for i=2:nframes % main dynamic programming loop if i>inmf fpos = ff(i-inmf:i,1); % fpos is the highest peak in each frame mf=median(fpos(pv(1:inmf)>0.6)); % find median frequency over past 2 seconds if isnan(mf) mf=median(fpos(pv(1:inmf)>0.5)); if isnan(mf) mf=median(fpos(pv(1:inmf)>0.4)); if isnan(mf) mf=median(fpos(pv(1:inmf)>0.3));% ==== clumsy way of ensuring that we take the best frames ==== if isnan(mf) mf=0; end end end end end medfx(i)=mf; % Frequency difference between candidates and cost df = dffact*(repmat(ff(i,:).',1,numopt) - repmat(ff(i-1,:),numopt,1))./(repmat(ff(i,:).',1,numopt) + repmat(ff(i-1,:),numopt,1)); costdf=w(3)*min((df-w(6)).^2,w(4)); % Cost related to the median pitch if mf==0 % this test was inverted in the original version costf = zeros(1,numopt); else costf = abs(ff(i,:) - mf)./mf; end [cost(i,:),prev(i,:)]=min(costdf + repmat(cost(i-1,:),numopt,1),[],2); % ==== should we allow the possibility of skipping frames ? ==== cost(i,:)=cost(i,:)+w(2)*costf + w(1)*camp(i,:); % add on costs that are independent of previous path end % Traceback fx=zeros(nframes,1); best = zeros(nframes,1); nose=find(cost(end,:)==min(cost(end,:))); % ==== bad method (dangerous) === best(end)=nose(1); % ff = [ff zeros(nframes,1)]; % not clear why this was here fx(end)=ff(end,best(end)); for i=nframes:-1:2 best(i-1)=prev(i,best(i)); fx(i-1)=ff(i-1,best(i-1)); end if nargout>=4 fv.vuvfea=vuvfea; % voiced-unvoiced features fv.best=best; % selected path fv.ff=ff; % pitch candidates fv.amp=amp; % pitch candidate amplitudes fv.medfx=medfx; % median pitch fv.w=w; % DP weights fv.dffact=dffact; % df scale factor end if ~nargout || any(m=='g') || any(m=='G') nax=0; % number of axes sets to link msk=pv>0.5; % find voiced frames as a mask fxg=fx; fxg(~msk)=NaN; % allow only good frames fxb=fx; fxb(msk)=NaN; % allow only bad frames if any(m=='G') || ~nargout && ~any(m=='g') clf; spgrambw(s,fs,'ilcwpf'); % draw spectrogram with log axes hold on plot(tx,log10(fxg),'-b',tx,log10(fxb),'-r'); % fx track yy=get(gca,'ylim'); plot(tx,yy(1)+yy*[-1;1]*(0.02+0.05*pv),'-k'); % P(V) track hold off nax=nax+1; axh(nax)=gca; if any(m=='g') figure; % need a new figure if plotting two graphs end end if any(m=='g') ns=length(s); [tsr,ix]=sort([(1:ns)/fs 0.5*(tx(1:end-1)+tx(2:end))']); % intermingle speech and frame boundaries jx(ix)=1:length(ix); % create inverse index sp2fr=jx(1:ns)-(0:ns-1); % speech sample to frame number spmsk=msk(sp2fr); % speech sample voiced mask sg=s; sg(~spmsk)=NaN; % good speech samples only sb=s; sb(spmsk)=NaN; % bad speech samples only clf; subplot(5,1,1); plot(tx,pv,'-b',(1:ns)/fs,0.5*mod(cumsum(fx(sp2fr)/fs),1)-0.6,'-b'); nax=nax+1; axh(nax)=gca; ylabel('\phi(t), P(V)'); set(gca,'ylim',[-0.65 1.05]); subplot(5,1,2:3); plot((1:ns)/fs,sg,'-b',(1:ns)/fs,sb,'-r'); nax=nax+1; axh(nax)=gca; subplot(5,1,4:5); plot(tx,fxg,'-b',tx,fxb,'-r'); ylabel('Pitch (Hz)'); % semilogy(tx,fxg,'-b',tx,fxb,'-r'); % ylabel(['Pitch (' yticksi 'Hz)']); set(gca,'ylim',[min(fxg)-30 max(fxg)+30]); nax=nax+1; axh(nax)=gca; end if nax>1 linkaxes(axh,'x'); end end function y=smooth(x,n) nx=length(x); c=cumsum(x); y=[c(1:2:n)./(1:2:n) (c(n+1:end)-c(1:end-n))/n (c(end)-c(end-n+2:2:end-1))./(n-2:-2:1)];